# Load libraires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# let load dataset
alleghemy_county_employee_salary_df = pd.read_csv(r"C:\Users\jki\Downloads\redacted-2022-december-31-wprdc.csv")
alleghemy_county_employee_salary_df.head(5)


alleghemy_county_employee_salary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6280 entries, 0 to 6279
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FIRST_NAME        6280 non-null   object
 1   LAST_NAME         6280 non-null   object
 2   Combo Name        6280 non-null   object
 3   DEPARTMENT        6280 non-null   object
 4   JOB_TITLE         6280 non-null   object
 5   ELECTED_OFFICIAL  6280 non-null   int64 
 6   DATE_STARTED      6280 non-null   object
 7   SEX               6280 non-null   object
 8   ETHNICITY         6280 non-null   object
 9   ORIG_START        6280 non-null   object
 10  DATE_TERM         1263 non-null   object
 11  PAY_STATUS        6280 non-null   object
 12   ANNUAL_SALARY    6280 non-null   object
 13   REGULAR_PAY      6280 non-null   object
 14   OVERTIME_PAY     6280 non-null   object
 15   INCENTIVE_PAY    6280 non-null   object
 16   GROSS_PAY        6280 non-null   object
 17  Unnamed: 17       1 non-null      object
dtypes: int64(1), object(17)
memory usage: 883.3+ KB


# lets remove spaces on the column names

alleghemy_county_employee_salary_df.rename(columns=lambda x: x.strip().replace(" ", "_"), inplace=True)
alleghemy_county_employee_salary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6280 entries, 0 to 6279
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FIRST_NAME        6280 non-null   object
 1   LAST_NAME         6280 non-null   object
 2   Combo_Name        6280 non-null   object
 3   DEPARTMENT        6280 non-null   object
 4   JOB_TITLE         6280 non-null   object
 5   ELECTED_OFFICIAL  6280 non-null   int64 
 6   DATE_STARTED      6280 non-null   object
 7   SEX               6280 non-null   object
 8   ETHNICITY         6280 non-null   object
 9   ORIG_START        6280 non-null   object
 10  DATE_TERM         1263 non-null   object
 11  PAY_STATUS        6280 non-null   object
 12  ANNUAL_SALARY     6280 non-null   object
 13  REGULAR_PAY       6280 non-null   object
 14  OVERTIME_PAY      6280 non-null   object
 15  INCENTIVE_PAY     6280 non-null   object
 16  GROSS_PAY         6280 non-null   object
 17  Unnamed:_17       1 non-null      object
dtypes: int64(1), object(17)
memory usage: 883.3+ KB


# lets change the data types to perfome numerical calculations
# Replace 'GROSS_PAY' with your actual column name
alleghemy_county_employee_salary_df['GROSS_PAY'] = (
    alleghemy_county_employee_salary_df['GROSS_PAY']
    .str.replace(',', '')  # Remove commas
    .str.strip()           # Remove leading and trailing spaces
    .astype(float)         # Convert to numeric type
)

# Convert 'DATE_STARTED' to datetime
alleghemy_county_employee_salary_df['DATE_STARTED'] = pd.to_datetime(alleghemy_county_employee_salary_df['DATE_STARTED'])

# Display the DataFrame info after converting data types
alleghemy_county_employee_salary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6280 entries, 0 to 6279
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   FIRST_NAME        6280 non-null   object        
 1   LAST_NAME         6280 non-null   object        
 2   Combo_Name        6280 non-null   object        
 3   DEPARTMENT        6280 non-null   object        
 4   JOB_TITLE         6280 non-null   object        
 5   ELECTED_OFFICIAL  6280 non-null   int64         
 6   DATE_STARTED      6280 non-null   datetime64[ns]
 7   SEX               6280 non-null   object        
 8   ETHNICITY         6280 non-null   object        
 9   ORIG_START        6280 non-null   object        
 10  DATE_TERM         1263 non-null   object        
 11  PAY_STATUS        6280 non-null   object        
 12  ANNUAL_SALARY     6280 non-null   object        
 13  REGULAR_PAY       6280 non-null   object        
 14  OVERTIME_PAY      6280 non-null   object        
 15  INCENTIVE_PAY     6280 non-null   object        
 16  GROSS_PAY         6280 non-null   float64       
 17  Unnamed:_17       1 non-null      object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(15)
memory usage: 883.3+ KB


# Get the count of each department
department_counts = alleghemy_county_employee_salary_df['DEPARTMENT'].value_counts()

# Display the counts

print(department_counts)

Human Services             935
Kane Regional Centers      814
Parks                      732
Jail                       635
Emergency Management       404
Health                     401
Police                     298
District Attorney          266
Public Works Operating     240
Facilities Management      230
Administrative Services    217
Sheriff                    210
Public Defender            162
Court Records              122
Information Technology     115
Medical Examiner           105
Controller                  92
Treasurer                   79
Economic Development        78
County Solicitor            57
Human Resources             27
County Manager              20
Budget & Finance             9
Equity and Inclusion         9
County Council               7
Children Initiatives         7
Retirement System            4
Sustainability               3
County Executive             2
Name: DEPARTMENT, dtype: int64


# Get the count of each department
department_counts = alleghemy_county_employee_salary_df['JOB_TITLE'].value_counts()

# Display the counts

print(department_counts)

SEASONAL AIDE                     552
CORRECTIONAL OFFICER              406
TELECOMMUNICATION OFFICER         314
LABORER                           195
POLICE OFFICER                    173
                                 ... 
PUBLIC HEALTH PROGRM REP SUPV       1
HIV PREP NAVIGATOR S - 3 YEARS      1
MGR CONTRACTS & PROCUREMENT         1
DEP DIRECTOR FACILITIES MAINT       1
GIS ANALYST 5 - 10 YEARS            1
Name: JOB_TITLE, Length: 1133, dtype: int64


# Get the count of each department
department_counts = alleghemy_county_employee_salary_df['ETHNICITY'].value_counts()

# Display the counts

print(department_counts)

White (Not of Hispanic Origin)    5140
Black                              962
Asian or Pacific Islander           82
Hispanic                            54
Two or More Races                   37
American Indian/Alaskan Native       5
Name: ETHNICITY, dtype: int64


import locale

# Set the locale to your preferred format (e.g., US English)
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# Calculate the rounded max
rounded_mean = round(alleghemy_county_employee_salary_df['GROSS_PAY'].mean())

# Format and display as an accounting value
formatted_mean = locale.currency(rounded_mean, grouping=True)
print(formatted_mean)

$51,056.00


import locale

# Set the locale to your preferred format (e.g., US English)
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# Calculate the rounded max
rounded_max = round(alleghemy_county_employee_salary_df['GROSS_PAY'].max())

# Format and display as an accounting value
formatted_max = locale.currency(rounded_max, grouping=True)
print(formatted_max)

$269,356.00


# Load in some packages
import calendar
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
from collections import Counter

# Group by 'Department' and calculate the sum of 'Gross Pay'
sum_grosspay_department = alleghemy_county_employee_salary_df.groupby('DEPARTMENT')['GROSS_PAY'].sum()

# Sort the values in descending order and select the top ten
top_ten_departments = sum_grosspay_department.sort_values(ascending=False).head(10)

# Display the result
print(top_ten_departments)

# Plot the top ten counties
top_ten_departments.plot(kind='bar', color='purple', figsize=(10, 6))
plt.title('Top Ten Departments with highest Salaries')
plt.xlabel('Depatment')
plt.ylabel('Gross Pay')
plt.show()

DEPARTMENT
Human Services            46821423.98
Jail                      45023879.00
Kane Regional Centers     37212378.06
Police                    29311931.28
Emergency Management      21382373.61
Health                    18533035.83
Sheriff                   18488717.69
District Attorney         13961326.76
Public Works Operating    12482802.12
Facilities Management     11226207.42
Name: GROSS_PAY, dtype: float64


# Group by 'JobTitles' and calculate the sum of 'Gross Pay'
sum_grosspay_jobtitles = alleghemy_county_employee_salary_df.groupby('JOB_TITLE')['GROSS_PAY'].sum()

# Sort the values in descending order and select the top ten
top_ten_jobtitles = sum_grosspay_jobtitles.sort_values(ascending=False).head(10)

# Display the result
print(top_ten_jobtitles)

# Plot the top ten counties
top_ten_jobtitles.plot(kind='bar', color='purple', figsize=(10, 6))
plt.title('Top Ten Job Titles with highest Salaries')
plt.xlabel('Job Titles')
plt.ylabel('Gross Pay')
plt.show()

JOB_TITLE
CORRECTIONAL OFFICER            31844763.72
POLICE OFFICER                  19729808.78
TELECOMMUNICATION OFFICER       16469221.08
DEPUTY SHERIFF                  14001177.64
LABORER                          9089410.60
CASEWORKER SUPERVISOR            8421962.05
SERGEANT                         8199551.59
NURSING ASSISTANT                7836780.83
CASEWORKER - CYF (lng Step)      5066712.26
RN RESIDENT CARE COORDINATOR     3708652.51
Name: GROSS_PAY, dtype: float64


# Group by 'Ethnicities' and calculate the sum of 'Gross Pay'
sum_grosspay_ethnicities = alleghemy_county_employee_salary_df.groupby('ETHNICITY')['GROSS_PAY'].sum()

# Sort the values in descending order and select the top ten
top_ten_ethnicities = sum_grosspay_ethnicities.sort_values(ascending=False).head(10)

# Display the result
print(top_ten_ethnicities)

# Plot the top ten counties
top_ten_ethnicities.plot(kind='bar', color='purple', figsize=(10, 6))
plt.title('Top Ten top Ethnicities with highest Salaries')
plt.xlabel('Ethnicities')
plt.ylabel('Gross Pay')
plt.show()

ETHNICITY
White (Not of Hispanic Origin)    2.645734e+08
Black                             4.797299e+07
Asian or Pacific Islander         3.593090e+06
Hispanic                          2.602029e+06
Two or More Races                 1.626565e+06
American Indian/Alaskan Native    2.620363e+05
Name: GROSS_PAY, dtype: float64


# Group by 'Gender' and calculate the sum of 'Gross Pay'
sum_grosspay_gender = alleghemy_county_employee_salary_df.groupby('SEX')['GROSS_PAY'].sum()

# Sort the values in descending order and select the top ten
gender_pay = sum_grosspay_gender.sort_values(ascending=False).head(5)

# Display the result
print(gender_pay)

# Plot the top ten counties
gender_pay.plot(kind='bar', color='purple', figsize=(10, 6))
plt.title('Salaries by Gender')
plt.xlabel('Gender')
plt.ylabel('Gross Pay')
plt.show()

SEX
M    1.827419e+08
F    1.378882e+08
Name: GROSS_PAY, dtype: float64

	FIRST_NAME	LAST_NAME	Combo Name	DEPARTMENT	JOB_TITLE	DATE_STARTED	SEX	ETHNICITY	ORIG_START	DATE_TERM	PAY_STATUS	ANNUAL_SALARY	REGULAR_PAY	OVERTIME_PAY	INCENTIVE_PAY	GROSS_PAY	Unnamed: 17
0	CATHERINE	ABALO	ABALO, CATHERINE	Kane Regional Centers	NURSING ASSISTANT	4/8/2010	F	Black	7/6/2009	NaN	Active	41,581.07	40,740.50	23,416.98	125.00	64,282.48	NaN
1	KEVIN J	ABBOTT	ABBOTT, KEVIN J	Emergency Management	FIRE INSTRUCTOR - PART TIME	6/11/2018	M	White (Not of Hispanic Origin)	6/11/2018	NaN	Active	30,576.00	378.00	-	-	378.00	NaN
2	JOY M	ABBOTT	ABBOTT, JOY M	Kane Regional Centers	COOK	2/14/1999	F	White (Not of Hispanic Origin)	2/2/1998	NaN	Active	40,761.76	44,015.85	1,923.77	225.00	46,164.62	NaN
3	ELIZABETH S	ABRAHAM	ABRAHAM, ELIZABETH S	Parks	SEASONAL AIDE	5/26/2022	F	White (Not of Hispanic Origin)	5/26/2022	9/5/2022	Terminated	39,936.00	6,488.00	-	100.00	6,588.00	NaN
4	JASMINE	ABRAM	ABRAM, JASMINE	Emergency Management	TELECOMMUNICATION OFFICER	5/23/2022	F	Black	5/31/2016	NaN	Active	52,240.03	25,637.96	15,994.95	125.00	41,757.91	NaN

Return Home

Allegheny County Employee Salaries EDA¶

1. What Department has the number of Employees¶

2 . Which Job Title has the highest number of Employees¶

3 . Which Ethnicity has the highest number of Employees¶

4 . What is the Average gross pay¶

5 . What is the Highest gross pay¶

6 . Rank the Top Ten Highest Paid Departments¶

7 . Rank the Top Ten Highest Paid Job Titles¶

7 . Rank the Top Ten Highest Ethnicities¶

8 . Display Salaries per Gender¶